{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "编码: [8, 7, 10, 10, 12]\n",
      "解码: hello\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.optim as optim\n",
    "import torch.nn.functional as F\n",
    "import numpy as np\n",
    "\n",
    "class SimpleTokenizer:\n",
    "    def __init__(self, text):\n",
    "        # 统计字符出现频率，构建词表\n",
    "        self.chars = sorted(set(text))\n",
    "        self.vocab_size = len(self.chars)\n",
    "        self.char_to_idx = {ch: i for i, ch in enumerate(self.chars)}\n",
    "        self.idx_to_char = {i: ch for i, ch in enumerate(self.chars)}\n",
    "    \n",
    "    def encode(self, text):\n",
    "        return [self.char_to_idx[ch] for ch in text]\n",
    "    \n",
    "    def decode(self, indices):\n",
    "        return ''.join([self.idx_to_char[idx] for idx in indices])\n",
    "\n",
    "# 示例\n",
    "text = \"hello world! this is a simple LLM.\"\n",
    "tokenizer = SimpleTokenizer(text)\n",
    "encoded = tokenizer.encode(\"hello\")\n",
    "print(\"编码:\", encoded)\n",
    "print(\"解码:\", tokenizer.decode(encoded))\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
